Machine Learning Algorithms
INDEX
1.Linear regression for Prediction of Suicide count from 2013-2033
2.With respect to Causes,Year
3.State,Year
4.Age grp,Year
5.Wrt State,Age grp,Year Culminated Model
0)Libraries,Extraction , cleaning and separating:
df<-read.table('Suicides in India 2001-2012.csv',header=T, sep=",")
#head(df)
#install.packages('tidyverse')
#install.packages('caret')
#install.packages("plotly")
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.1
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.3 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## Warning: package 'readr' was built under R version 4.1.1
## Warning: package 'stringr' was built under R version 4.1.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(caret)
## Warning: package 'caret' was built under R version 4.1.1
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(dplyr)
library(ggplot2)
library(plotly)
## Warning: package 'plotly' was built under R version 4.1.1
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.1.1
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-2
#Data Cleaning
#Replacing Values for UT
df["State"][df["State"] == "A & N Islands"]<-"A & N Islands-Ut"
df["State"][df["State"] == "Chandigarh"]<-"Chandigarh-Ut"
df["State"][df["State"] == "D & N Haveli"]<-"D & N Haveli-Ut"
df["State"][df["State"] == "Daman & Diu"]<-"Daman & Diu-Ut"
df["State"][df["State"] == "Lakshadweep"]<-"Lakshadweep-Ut"
df["State"][df["State"] == "Delhi"] <-"Delhi-Ut"
#head(df)
#Renaming causes
df["Type"][df["Type"]=="Bankruptcy or Sudden change in Economic"]<-"Sudden change in Economic Status or Bankruptcy"
df["Type"][df["Type"]=="By Other means (please specify)"]<-"By Other means"
df["Type"][df["Type"]=="Not having Children(Barrenness/Impotency"]<-"Not having Children(Impotency)"
df["Type"][df["Type"]=="By Jumping from (Building)"]<-"By Jumping from Building"
df["Type"][df["Type"]=="Hr. Secondary/Intermediate/Pre-Universit"]<-"Hr. Secondary/Intermediate/Pre-University"
df["Type"][df["Type"]=="Failure in Examination"]<-"Examination Failure"
df["Type"][df["Type"]=="By coming under running vehicles/trains"]<-"By road or railway accidents"
df["Type"][df["Type"]=="Bankruptcy or Sudden change in Economic Status"]<-"Sudden change in Economic Status or Bankruptcy"
df["Type"][df["Type"]=="Not having Children (Barrenness/Impotency"]<-"Not having Children(Impotency)"
#causescount
#head(df)
#drop the unwanted State-titles
df1 <- df[!(df$State=="Total (Uts)" | df$State=="Total (All India)" | df$State=="Total (States)"),]
#drop the values ==0 under Total
df2 <- df1[!(df1$Total==0),]
# drop the unwanted Types
df2 <- df2[!(df2$Type=="By Other means" | df2$Type=="Others (Please Specify)" | df2$Type=="Causes Not known" | df2$Type=="Other Causes (Please Specity)"),]
#Spliting the dataframe into smaller dataframe based on the column "Type_code"
causesdf=filter(df2,df2$Type_code=="Causes")
edudf=filter(df2,df2$Type_code=="Education_Status")
meansdf=filter(df2,df2$Type_code=="Means_adopted")
professionaldf=filter(df2,df2$Type_code=="Professional_Profile")
socialdf=filter(df2,df2$Type_code=="Social_Status")
1.1)Linear Regression for prediction for suicide count from 2013-2033
# Extracting the needed year and suicide count columns
suicide_count_overyears <- df2 %>% group_by(Year) %>% summarise(total_case=sum(Total))
suicide_count_overyears
## # A tibble: 12 x 2
## Year total_case
## <int> <int>
## 1 2001 467928
## 2 2002 476738
## 3 2003 482322
## 4 2004 486323
## 5 2005 486115
## 6 2006 512676
## 7 2007 522233
## 8 2008 531216
## 9 2009 539470
## 10 2010 564083
## 11 2011 564376
## 12 2012 547894
#Testing co relation
cor(suicide_count_overyears$Year,suicide_count_overyears$total_case)
## [1] 0.962955
cor.test(suicide_count_overyears$Year,suicide_count_overyears$total_case)
##
## Pearson's product-moment correlation
##
## data: suicide_count_overyears$Year and suicide_count_overyears$total_case
## t = 11.292, df = 10, p-value = 5.163e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8696678 0.9898336
## sample estimates:
## cor
## 0.962955
#Partitioning into train and test
set.seed(123)
train_samples <- suicide_count_overyears$Year %>%
createDataPartition(p=0.65,list=FALSE)
#train_samples
head(train_samples)
## Resample1
## [1,] 1
## [2,] 3
## [3,] 5
## [4,] 6
## [5,] 8
## [6,] 9
train <- suicide_count_overyears[train_samples,]
test <- suicide_count_overyears[-train_samples,]
#train
#test
#suicide_count_overyears
# LR Model creation
model <- lm(total_case~Year,data=train)
summary(model)
##
## Call:
## lm(formula = total_case ~ Year, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13947 -3539 3390 4469 11714
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17077042 1946684 -8.772 0.000122 ***
## Year 8767 970 9.038 0.000103 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9839 on 6 degrees of freedom
## Multiple R-squared: 0.9316, Adjusted R-squared: 0.9202
## F-statistic: 81.68 on 1 and 6 DF, p-value: 0.0001028
#Make predictions
pred <- model %>%
predict(test)
pred
## 1 2 3 4
## 473762.3 491295.5 517595.5 543895.4
#Verification with actual value and predicted values
RMSE <- RMSE(pred,test$total_case)
RMSE
## [1] 10754.4
R2 <- R2(pred,test$total_case)
R2
## [1] 0.9753916
hist(model$residuals)

qqnorm(model$residuals,ylab = "Residuals")
qqline(model$residuals)

# Prediction
future_years=data.frame(Year=c(2013:2033))
future_years$total_case <- model %>%
predict(future_years)
future_years
## Year total_case
## 1 2013 570195.3
## 2 2014 578961.9
## 3 2015 587728.5
## 4 2016 596495.2
## 5 2017 605261.8
## 6 2018 614028.4
## 7 2019 622795.1
## 8 2020 631561.7
## 9 2021 640328.4
## 10 2022 649095.0
## 11 2023 657861.6
## 12 2024 666628.3
## 13 2025 675394.9
## 14 2026 684161.5
## 15 2027 692928.2
## 16 2028 701694.8
## 17 2029 710461.4
## 18 2030 719228.1
## 19 2031 727994.7
## 20 2032 736761.3
## 21 2033 745528.0
# Bar plot for rise in suicide cases
total_suicide_count <- rbind(suicide_count_overyears,future_years)
#total
fig <- plot_ly(
x = total_suicide_count$Year,
y = total_suicide_count$total_case,
name = "Variation of suicide count over the years 2013-2032",
type = "bar",
)
fig <- fig %>% layout(title = "Suicide Trend Over the years",
barmode = 'group',
xaxis = list(title = "Years"),
yaxis = list(title = "Count"))
fig
1.2)Logistic Regression for prediction for suicide count from 2013-2033
# Extracting the needed year and suicide count columns
suicide_count_overyears <- df2 %>% group_by(Year) %>% summarise(total_case=sum(Total))
suicide_count_overyears
## # A tibble: 12 x 2
## Year total_case
## <int> <int>
## 1 2001 467928
## 2 2002 476738
## 3 2003 482322
## 4 2004 486323
## 5 2005 486115
## 6 2006 512676
## 7 2007 522233
## 8 2008 531216
## 9 2009 539470
## 10 2010 564083
## 11 2011 564376
## 12 2012 547894
#Testing co relation
cor(suicide_count_overyears$Year,suicide_count_overyears$total_case)
## [1] 0.962955
cor.test(suicide_count_overyears$Year,suicide_count_overyears$total_case)
##
## Pearson's product-moment correlation
##
## data: suicide_count_overyears$Year and suicide_count_overyears$total_case
## t = 11.292, df = 10, p-value = 5.163e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8696678 0.9898336
## sample estimates:
## cor
## 0.962955
#Partitioning into train and test
set.seed(123)
train_samples <- suicide_count_overyears$Year %>%
createDataPartition(p=0.65,list=FALSE)
#train_samples
head(train_samples)
## Resample1
## [1,] 1
## [2,] 3
## [3,] 5
## [4,] 6
## [5,] 8
## [6,] 9
train <- suicide_count_overyears[train_samples,]
test <- suicide_count_overyears[-train_samples,]
train
## # A tibble: 8 x 2
## Year total_case
## <int> <int>
## 1 2001 467928
## 2 2003 482322
## 3 2005 486115
## 4 2006 512676
## 5 2008 531216
## 6 2009 539470
## 7 2011 564376
## 8 2012 547894
test
## # A tibble: 4 x 2
## Year total_case
## <int> <int>
## 1 2002 476738
## 2 2004 486323
## 3 2007 522233
## 4 2010 564083
as.factor(suicide_count_overyears$Total)
## Warning: Unknown or uninitialised column: `Total`.
## factor(0)
## Levels:
# Training model
logistic_modelLR <- glm(total_case~Year, data = train)
logistic_modelLR
##
## Call: glm(formula = total_case ~ Year, data = train)
##
## Coefficients:
## (Intercept) Year
## -17077042 8767
##
## Degrees of Freedom: 7 Total (i.e. Null); 6 Residual
## Null Deviance: 8.487e+09
## Residual Deviance: 580800000 AIC: 173.5
pred <- logistic_modelLR %>%
predict(test)
pred
## 1 2 3 4
## 473762.3 491295.5 517595.5 543895.4
RMSE <- RMSE(pred,test$total_case)
RMSE
## [1] 10754.4
R2 <- R2(pred,test$total_case)
R2
## [1] 0.9753916
test
## # A tibble: 4 x 2
## Year total_case
## <int> <int>
## 1 2002 476738
## 2 2004 486323
## 3 2007 522233
## 4 2010 564083
2)CAUSES Filtration
topcauses<-causesdf%>%select(Type,Year,Total) %>% group_by(Type)%>%
summarise(Total=sum(Total)) %>% arrange(desc(Total))%>% head(10)
topcauses<-as.data.frame(topcauses)
head(topcauses)
## Type Total
## 1 Family Problems 341952
## 2 Other Prolonged Illness 194565
## 3 Insanity/Mental Illness 94229
## 4 Love Affairs 45039
## 5 Sudden change in Economic Status or Bankruptcy 35410
## 6 Poverty 32684
topcauses1<-causesdf%>%select(Type,Year,Total) %>% group_by(Year,Type="Family Problems")%>%
summarise(Total=sum(Total)) %>% arrange(desc(Type))
## `summarise()` has grouped output by 'Year'. You can override using the `.groups` argument.
head(topcauses1)
## # A tibble: 6 x 3
## # Groups: Year [6]
## Year Type Total
## <int> <chr> <int>
## 1 2001 Family Problems 74067
## 2 2002 Family Problems 75891
## 3 2003 Family Problems 78419
## 4 2004 Family Problems 78690
## 5 2005 Family Problems 77022
## 6 2006 Family Problems 85675
topcauses2<-causesdf%>%select(Type,Year,Total) %>% group_by(Year,Type="Other Prolonged Illness")%>%
summarise(Total=sum(Total)) %>% arrange(desc(Type))
## `summarise()` has grouped output by 'Year'. You can override using the `.groups` argument.
head(topcauses2)
## # A tibble: 6 x 3
## # Groups: Year [6]
## Year Type Total
## <int> <chr> <int>
## 1 2001 Other Prolonged Illness 74067
## 2 2002 Other Prolonged Illness 75891
## 3 2003 Other Prolonged Illness 78419
## 4 2004 Other Prolonged Illness 78690
## 5 2005 Other Prolonged Illness 77022
## 6 2006 Other Prolonged Illness 85675
topcauses3<-causesdf%>%select(Type,Year,Total) %>% group_by(Year,Type="Insanity/Mental Illness")%>%
summarise(Total=sum(Total)) %>% arrange(desc(Type))
## `summarise()` has grouped output by 'Year'. You can override using the `.groups` argument.
head(topcauses3)
## # A tibble: 6 x 3
## # Groups: Year [6]
## Year Type Total
## <int> <chr> <int>
## 1 2001 Insanity/Mental Illness 74067
## 2 2002 Insanity/Mental Illness 75891
## 3 2003 Insanity/Mental Illness 78419
## 4 2004 Insanity/Mental Illness 78690
## 5 2005 Insanity/Mental Illness 77022
## 6 2006 Insanity/Mental Illness 85675
topcauses4<-causesdf%>%select(Type,Year,Total) %>% group_by(Year,Type="Love Affairs")%>%
summarise(Total=sum(Total)) %>% arrange(desc(Type))
## `summarise()` has grouped output by 'Year'. You can override using the `.groups` argument.
head(topcauses4)
## # A tibble: 6 x 3
## # Groups: Year [6]
## Year Type Total
## <int> <chr> <int>
## 1 2001 Love Affairs 74067
## 2 2002 Love Affairs 75891
## 3 2003 Love Affairs 78419
## 4 2004 Love Affairs 78690
## 5 2005 Love Affairs 77022
## 6 2006 Love Affairs 85675
topcauses5<-causesdf%>%select(Type,Year,Total) %>% group_by(Year,Type="Sudden change in Economic Status or Bankruptcy")%>%summarise(Total=sum(Total)) %>% arrange(desc(Type))
## `summarise()` has grouped output by 'Year'. You can override using the `.groups` argument.
head(topcauses5)
## # A tibble: 6 x 3
## # Groups: Year [6]
## Year Type Total
## <int> <chr> <int>
## 1 2001 Sudden change in Economic Status or Bankruptcy 74067
## 2 2002 Sudden change in Economic Status or Bankruptcy 75891
## 3 2003 Sudden change in Economic Status or Bankruptcy 78419
## 4 2004 Sudden change in Economic Status or Bankruptcy 78690
## 5 2005 Sudden change in Economic Status or Bankruptcy 77022
## 6 2006 Sudden change in Economic Status or Bankruptcy 85675
top5causes=rbind(topcauses1,topcauses2,topcauses3,topcauses4,topcauses5)
2.1)CAUSES MLR
set.seed(123)
train_samples <- top5causes$Year %>%
createDataPartition(p=0.70,list=FALSE)
train <- top5causes[train_samples,]
test <- top5causes[-train_samples,]
# LR Model creation
model <- lm(Total~Year+Type,data=train)
summary(model)
##
## Call:
## lm(formula = Total ~ Year + Type, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7283.0 -2113.2 360.6 2606.4 4655.2
##
## Coefficients:
## Estimate Std. Error
## (Intercept) -2101988.0 308256.4
## Year 1088.5 153.7
## TypeInsanity/Mental Illness 745.2 1718.4
## TypeLove Affairs 367.2 1542.1
## TypeOther Prolonged Illness -710.7 1636.9
## TypeSudden change in Economic Status or Bankruptcy -511.1 1638.1
## t value Pr(>|t|)
## (Intercept) -6.819 4.33e-08 ***
## Year 7.084 1.89e-08 ***
## TypeInsanity/Mental Illness 0.434 0.667
## TypeLove Affairs 0.238 0.813
## TypeOther Prolonged Illness -0.434 0.667
## TypeSudden change in Economic Status or Bankruptcy -0.312 0.757
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3523 on 38 degrees of freedom
## Multiple R-squared: 0.5854, Adjusted R-squared: 0.5309
## F-statistic: 10.73 on 5 and 38 DF, p-value: 1.754e-06
#Make predictions
pred <- model %>%
predict(test)
pred
## 1 2 3 4 5 6 7 8
## 86973.33 78643.20 80820.19 84085.67 85174.16 77922.11 79010.60 80099.09
## 9 10 11 12 13 14 15 16
## 81187.59 87718.55 77544.08 85163.54 75577.28 82108.25 83196.74 85373.73
#Verification with actual value and predicted values
RMSE <- RMSE(pred,test$Total)
RMSE
## [1] 2926.94
R2 <- R2(pred,test$Total)
R2
## [1] 0.8575155
hist(model$residuals)

qqnorm(model$residuals,ylab = "Residuals")
qqline(model$residuals)

# Prediction
new.speeds <- data.frame(
Year = c(2023, 2024, 2025) , Type = c("Family Problems","Love Affairs","Other Prolonged Illness")
)
#(agedf)
predict(model, newdata = new.speeds)
## 1 2 3
## 100035.3 101490.9 101501.6
future_years
## Year total_case
## 1 2013 570195.3
## 2 2014 578961.9
## 3 2015 587728.5
## 4 2016 596495.2
## 5 2017 605261.8
## 6 2018 614028.4
## 7 2019 622795.1
## 8 2020 631561.7
## 9 2021 640328.4
## 10 2022 649095.0
## 11 2023 657861.6
## 12 2024 666628.3
## 13 2025 675394.9
## 14 2026 684161.5
## 15 2027 692928.2
## 16 2028 701694.8
## 17 2029 710461.4
## 18 2030 719228.1
## 19 2031 727994.7
## 20 2032 736761.3
## 21 2033 745528.0
2.2)Causes Lasso
#Testing co relation
#cor(suicide_count_overyears$Year,suicide_count_overyears$total_case)
#cor.test(suicide_count_overyears$Year,suicide_count_overyears$total_case)
#Partitioning into train and test
set.seed(123)
train_samples <- top5causes$Total %>%
createDataPartition(p=0.80,list=FALSE)
train <- top5causes[train_samples,]
test <- top5causes[-train_samples,]
#agedf
#install.packages("glmnet")
#perform k-fold cross-validation to find optimal lambda value
cv_model <- cv.glmnet(data.matrix(train[, c('Year','Type')]), train$Total, alpha = 0.5)
cv_model
##
## Call: cv.glmnet(x = data.matrix(train[, c("Year", "Type")]), y = train$Total, alpha = 0.5)
##
## Measure: Mean-Squared Error
##
## Lambda Index Measure SE Nonzero
## min 546.3 30 11625269 1598685 1
## 1se 1668.4 18 12943740 1230864 1
#find optimal lambda value that minimizes test MSE
best_lambda <- cv_model$lambda.min
best_lambda
## [1] 546.3268
#[1] best_lambda=546.3268
#produce plot of test MSE by lambda value
plot(cv_model)

#Best Lasso model
#t=data.matrix(train[, c('Year','State','Age_group')])
#t
best_model <- glmnet(data.matrix(train[, c('Year','Type')]), train$Total, alpha = 0.5, lambda = best_lambda)
coef(best_model)
## 3 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) -1973496.614
## Year 1024.534
## Type .
#Prediction
#define new observation
#new = matrix(c(2015,'Maharashtra',"45-59"), nrow=1, ncol=3)
#data.matrix(c(2015,'Maharashtra',"45-59"))
#new
#use lasso regression model to predict response value
#predict(best_model, s = best_lambda, newx = new)
x=data.matrix(test[, c('Year','Type')])
#x
y=test$Total
#Metrics
y_predicted <- predict(best_model, s = best_lambda, newx = x)
RMSE <- RMSE(y,y_predicted)
RMSE
## [1] 2539.525
R2 <- R2(y,y_predicted)
R2
## [,1]
## s1 0.9194946
#find SST and SSE
#sst <- sum((y - mean(y))^2)
#sse <- sum((y_predicted - y)^2)
#find R-Squared
#rsq <- 1-sse/sst
#rsq
2.3)SVM for Causes
library(e1071)
## Warning: package 'e1071' was built under R version 4.1.1
set.seed(123)
train_samples <- top5causes$Total %>% createDataPartition(p=0.65,list=FALSE)
train <- top5causes[train_samples,]
train$Type<-as.factor(train$Type)
test <- top5causes[-train_samples,]
test$Type<-as.factor(test$Type)
#agedf
# MLR Model creation
causessvm <- svm(Total~Year+Type,data=train, kernel = 'linear')
summary(causessvm)
##
## Call:
## svm(formula = Total ~ Year + Type, data = train, kernel = "linear")
##
##
## Parameters:
## SVM-Type: eps-regression
## SVM-Kernel: linear
## cost: 1
## gamma: 0.1666667
## epsilon: 0.1
##
##
## Number of Support Vectors: 38
#Make predictions
pred <- causessvm %>%predict(test)
pred
## 1 2 3 4 5 6 7 8
## 74980.24 79305.54 83630.83 85072.60 77481.51 83248.57 84690.34 89015.64
## 9 10 11 12 13 14 15 16
## 90457.40 76419.51 77861.28 79303.05 80744.81 87953.64 76338.50 83547.33
## 17 18 19 20
## 74977.75 82186.58 86511.88 87953.64
#Verification with actual value and predicted values
RMSE <- RMSE(pred,test$Total)
RMSE
## [1] 2570.844
R2 <- R2(pred,test$Total)
R2
## [1] 0.7645348
2.4)Logistic Regression for Causes
#Partitioning into train and test
set.seed(123)
train_samples <- top5causes$Total %>%
createDataPartition(p=0.80,list=FALSE)
train <- top5causes[train_samples,]
test <- top5causes[-train_samples,]
as.factor(top5causes$Total)
## [1] 74067 75891 78419 78690 77022 85675 84575 86225 85364 90476 89927 81524
## [13] 74067 75891 78419 78690 77022 85675 84575 86225 85364 90476 89927 81524
## [25] 74067 75891 78419 78690 77022 85675 84575 86225 85364 90476 89927 81524
## [37] 74067 75891 78419 78690 77022 85675 84575 86225 85364 90476 89927 81524
## [49] 74067 75891 78419 78690 77022 85675 84575 86225 85364 90476 89927 81524
## 12 Levels: 74067 75891 77022 78419 78690 81524 84575 85364 85675 ... 90476
# Training model
logistic_modelC <- glm(Total~Year+Type, data = train)
logistic_modelC
##
## Call: glm(formula = Total ~ Year + Type, data = train)
##
## Coefficients:
## (Intercept)
## -2.237e+06
## Year
## 1.156e+03
## TypeInsanity/Mental Illness
## -6.572e+01
## TypeLove Affairs
## -1.386e+01
## TypeOther Prolonged Illness
## -2.740e+02
## TypeSudden change in Economic Status or Bankruptcy
## -2.213e+02
##
## Degrees of Freedom: 47 Total (i.e. Null); 42 Residual
## Null Deviance: 1.306e+09
## Residual Deviance: 515300000 AIC: 927.3
pred <- logistic_modelC %>%
predict(test)
pred
## 1 2 3 4 5 6 7 8
## 78000.87 83781.38 84937.48 77053.07 78209.17 79365.28 80521.38 82833.58
## 9 10 11 12
## 86301.89 85197.64 75741.42 86146.34
RMSE <- RMSE(pred,test$Total)
RMSE
## [1] 2281.396
R2 <- R2(pred,test$Total)
R2
## [1] 0.9157739
test
## # A tibble: 12 x 3
## # Groups: Year [9]
## Year Type Total
## <int> <chr> <int>
## 1 2003 Other Prolonged Illness 78419
## 2 2008 Other Prolonged Illness 86225
## 3 2009 Other Prolonged Illness 85364
## 4 2002 Insanity/Mental Illness 75891
## 5 2003 Insanity/Mental Illness 78419
## 6 2004 Insanity/Mental Illness 78690
## 7 2005 Insanity/Mental Illness 77022
## 8 2007 Insanity/Mental Illness 84575
## 9 2010 Insanity/Mental Illness 90476
## 10 2009 Love Affairs 85364
## 11 2001 Sudden change in Economic Status or Bankruptcy 74067
## 12 2010 Sudden change in Economic Status or Bankruptcy 90476
3)Top 3 states filtration
topstate<-df2%>%filter(!State %in% c("Total (All India)","Total (States)","Total (Uts)"))%>%select(State,Year,Total) %>% group_by(State)%>%
summarise(Total=sum(Total)) %>% arrange(desc(Total))%>% head(10)
topstate<-as.data.frame(topstate)
topstate
## State Total
## 1 Maharashtra 855611
## 2 West Bengal 709969
## 3 Andhra Pradesh 703486
## 4 Tamil Nadu 696957
## 5 Karnataka 594641
## 6 Kerala 472724
## 7 Madhya Pradesh 391781
## 8 Gujarat 296395
## 9 Chhattisgarh 236825
## 10 Odisha 206601
topstate1<-df2%>%filter(!State %in% c("Total (All India)","Total (States)","Total (Uts)"))%>%
select(State,Year,Total) %>% group_by(Year,State="Maharashtra")%>%
summarise(Total=sum(Total)) %>% arrange(desc(State))
## `summarise()` has grouped output by 'Year'. You can override using the `.groups` argument.
topstate2<-df2%>%filter(!State %in% c("Total (All India)","Total (States)","Total (Uts)"))%>%
select(State,Year,Total) %>% group_by(Year,State="West Bengal")%>%
summarise(Total=sum(Total)) %>% arrange(desc(State))
## `summarise()` has grouped output by 'Year'. You can override using the `.groups` argument.
topstate3<-df2%>%filter(!State %in% c("Total (All India)","Total (States)","Total (Uts)"))%>%
select(State,Year,Total) %>% group_by(Year,State="Andhra Pradesh")%>%
summarise(Total=sum(Total)) %>% arrange(desc(State))
## `summarise()` has grouped output by 'Year'. You can override using the `.groups` argument.
top3state=rbind(topstate1,topstate2,topstate3)
#View(top3state)
3.1)TOP 3 STATES MLR
# Extracting the needed year and suicide count columns
#Testing co relation
#cor(suicide_count_overyears$Year,suicide_count_overyears$total_case)
#cor.test(suicide_count_overyears$Year,suicide_count_overyears$total_case)
#Partitioning into train and test
set.seed(123)
train_samples <- top3state$Year %>% createDataPartition(p=0.80,list=FALSE)
train <- top3state[train_samples,]
test <- top3state[-train_samples,]
# LR Model creation
model <- lm(Total~Year+State,data=train)
summary(model)
##
## Call:
## lm(formula = Total ~ Year + State, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18679 -2843 1619 3272 17744
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.781e+07 1.002e+06 -17.781 <2e-16 ***
## Year 9.133e+03 4.993e+02 18.293 <2e-16 ***
## StateMaharashtra -5.237e+02 4.081e+03 -0.128 0.899
## StateWest Bengal 1.444e+03 4.200e+03 0.344 0.734
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9565 on 28 degrees of freedom
## Multiple R-squared: 0.9247, Adjusted R-squared: 0.9166
## F-statistic: 114.5 on 3 and 28 DF, p-value: 7.941e-16
#Make predictions
pred <- model %>%
predict(test)
pred
## 1 2 3 4
## 555472.2 466106.9 502639.9 528596.1
#Verification with actual value and predicted values
RMSE <- RMSE(pred,test$Total)
RMSE
## [1] 9520.142
R2 <- R2(pred,test$Total)
R2
## [1] 0.947362
hist(model$residuals)

qqnorm(model$residuals,ylab = "Residuals")
qqline(model$residuals)

# Prediction
new.speeds <- data.frame(
Year = c(2013, 2014, 2025) , State = c("Maharashtra","West Bengal","Maharashtra"))
#(agedf)
predict(model, newdata = new.speeds)
## 1 2 3
## 573738.7 584839.2 683337.7
#future_years=data.frame(Year=c(2013:2033))
#future_years$total_case <- model %>%
# predict(future_years)
3.2)Lasso for top 3 states
#install.packages("glmnet")
library(glmnet)
#perform k-fold cross-validation to find optimal lambda value
cv_model <- cv.glmnet(data.matrix(top3state[, c('Year','State')]), top3state$Total, alpha = 1)
cv_model
##
## Call: cv.glmnet(x = data.matrix(top3state[, c("Year", "State")]), y = top3state$Total, alpha = 1)
##
## Measure: Mean-Squared Error
##
## Lambda Index Measure SE Nonzero
## min 1124 37 97255652 23366319 1
## 1se 4536 22 116736759 29342687 1
#find optimal lambda value that minimizes test MSE
best_lambda <- cv_model$lambda.min
best_lambda
## [1] 1123.63
#[1] best_lambda=1123.63
#produce plot of test MSE by lambda value
plot(cv_model)

#Best Lasso model
best_model <- glmnet(data.matrix(top3state[, c('Year','State')]), top3state$Total, alpha = 1, lambda = best_lambda)
coef(best_model)
## 3 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) -17432523.740
## Year 8944.749
## State .
#Prediction
#define new observation
#new = matrix(c(2005,'Maharashtra'), nrow=1, ncol=2)
#new
#use lasso regression model to predict response value
#predict(best_model, s = best_lambda, newx = new)
x=data.matrix(top3state[, c('Year','State')])
y=top3state$Total
#Metrics
y_predicted <- predict(best_model, s = best_lambda, newx = x)
#find SST and SSE
sst <- sum((y - mean(y))^2)
sse <- sum((y_predicted - y)^2)
#find R-Squared
rsq <- 1 - sse/sst
rsq
## [1] 0.9261391
3.3)SVM for top 3 states
top3state
## # A tibble: 36 x 3
## # Groups: Year [12]
## Year State Total
## <int> <chr> <int>
## 1 2001 Maharashtra 467928
## 2 2002 Maharashtra 476738
## 3 2003 Maharashtra 482322
## 4 2004 Maharashtra 486323
## 5 2005 Maharashtra 486115
## 6 2006 Maharashtra 512676
## 7 2007 Maharashtra 522233
## 8 2008 Maharashtra 531216
## 9 2009 Maharashtra 539470
## 10 2010 Maharashtra 564083
## # ... with 26 more rows
library(e1071)
set.seed(123)
train_samples <- top3state$Total %>% createDataPartition(p=0.80,list=FALSE)
train <- top3state[train_samples,]
test <- top3state[-train_samples,]
#agedf
# MLR Model creation
statemodelsvm <- svm(Total~State,data=train, kernel = 'linear')
summary(statemodelsvm)
##
## Call:
## svm(formula = Total ~ State, data = train, kernel = "linear")
##
##
## Parameters:
## SVM-Type: eps-regression
## SVM-Kernel: linear
## cost: 1
## gamma: 0.3333333
## epsilon: 0.1
##
##
## Number of Support Vectors: 32
#Make predictions
pred <- statemodelsvm %>%predict(test)
pred
## 1 2 3 4
## 517453 517456 517456 517453
#Verification with actual value and predicted values
RMSE <- RMSE(pred,test$Total)
RMSE
## [1] 38165.66
R2 <- R2(pred,test$Total)
R2
## [1] 0.8750611
3.4)Logistic Regression for top 3 states
# Splitting dataset
set.seed(123)
train_samples <- top3state$Year %>% createDataPartition(p=0.65,list=FALSE)
#train_samples
head(train_samples)
## Resample1
## [1,] 2
## [2,] 3
## [3,] 4
## [4,] 5
## [5,] 7
## [6,] 9
train <- top3state[train_samples,]
test <- top3state[-train_samples,]
train
## # A tibble: 24 x 3
## # Groups: Year [11]
## Year State Total
## <int> <chr> <int>
## 1 2002 Maharashtra 476738
## 2 2003 Maharashtra 482322
## 3 2004 Maharashtra 486323
## 4 2005 Maharashtra 486115
## 5 2007 Maharashtra 522233
## 6 2009 Maharashtra 539470
## 7 2010 Maharashtra 564083
## 8 2011 Maharashtra 564376
## 9 2012 Maharashtra 547894
## 10 2002 West Bengal 476738
## # ... with 14 more rows
test
## # A tibble: 12 x 3
## # Groups: Year [8]
## Year State Total
## <int> <chr> <int>
## 1 2001 Maharashtra 467928
## 2 2006 Maharashtra 512676
## 3 2008 Maharashtra 531216
## 4 2001 West Bengal 467928
## 5 2010 West Bengal 564083
## 6 2001 Andhra Pradesh 467928
## 7 2005 Andhra Pradesh 486115
## 8 2006 Andhra Pradesh 512676
## 9 2007 Andhra Pradesh 522233
## 10 2009 Andhra Pradesh 539470
## 11 2010 Andhra Pradesh 564083
## 12 2011 Andhra Pradesh 564376
as.factor(top3state$Total)
## [1] 467928 476738 482322 486323 486115 512676 522233 531216 539470 564083
## [11] 564376 547894 467928 476738 482322 486323 486115 512676 522233 531216
## [21] 539470 564083 564376 547894 467928 476738 482322 486323 486115 512676
## [31] 522233 531216 539470 564083 564376 547894
## 12 Levels: 467928 476738 482322 486115 486323 512676 522233 531216 ... 564376
# Training model
logistic_model <- glm(Total~Year+State, data = train)
logistic_model
##
## Call: glm(formula = Total ~ Year + State, data = train)
##
## Coefficients:
## (Intercept) Year StateMaharashtra StateWest Bengal
## -17222074 8838 3335 2084
##
## Degrees of Freedom: 23 Total (i.e. Null); 20 Residual
## Null Deviance: 2.408e+10
## Residual Deviance: 1.825e+09 AIC: 513.6
pred <- logistic_model %>%
predict(test)
pred
## 1 2 3 4 5 6 7 8
## 465812.2 510001.5 527677.2 464560.5 544101.2 462476.9 497828.3 506666.2
## 9 10 11 12
## 515504.0 533179.7 542017.6 550855.5
RMSE <- RMSE(pred,test$Total)
RMSE
## [1] 10771.61
R2 <- R2(pred,test$Total)
R2
## [1] 0.9635767
test
## # A tibble: 12 x 3
## # Groups: Year [8]
## Year State Total
## <int> <chr> <int>
## 1 2001 Maharashtra 467928
## 2 2006 Maharashtra 512676
## 3 2008 Maharashtra 531216
## 4 2001 West Bengal 467928
## 5 2010 West Bengal 564083
## 6 2001 Andhra Pradesh 467928
## 7 2005 Andhra Pradesh 486115
## 8 2006 Andhra Pradesh 512676
## 9 2007 Andhra Pradesh 522233
## 10 2009 Andhra Pradesh 539470
## 11 2010 Andhra Pradesh 564083
## 12 2011 Andhra Pradesh 564376
4)WRT AGE GRP Filtration
agedf<-df2%>% select(Year,Age_group,Total)%>%
filter(!Age_group=="0-100")%>%
filter(!Age_group=="0-100+")%>%
group_by(Year,Age_group)%>%
summarise(Total=sum(Total))%>% arrange(desc(Age_group))
## `summarise()` has grouped output by 'Year'. You can override using the `.groups` argument.
agedf<-as.data.frame(agedf)
4.1)AGE GROUP MLR
# Extracting the needed year and suicide count columns
#Testing co relation
#cor(suicide_count_overyears$Year,suicide_count_overyears$total_case)
#cor.test(suicide_count_overyears$Year,suicide_count_overyears$total_case)
#Partitioning into train and test
set.seed(123)
train_samples <- agedf$Year %>% createDataPartition(p=0.65,list=FALSE)
train <- agedf[train_samples,]
test <- agedf[-train_samples,]
# LR Model creation
model <- lm(Total~Year+Age_group,data=train)
summary(model)
##
## Call:
## lm(formula = Total ~ Year + Age_group, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5598.3 -1889.8 124.6 1879.7 6777.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1431382.0 279654.4 -5.118 1.21e-05 ***
## Year 715.9 139.4 5.136 1.14e-05 ***
## Age_group15-29 94742.6 1417.1 66.856 < 2e-16 ***
## Age_group30-44 89052.6 1554.9 57.273 < 2e-16 ***
## Age_group45-59 49284.1 1548.2 31.834 < 2e-16 ***
## Age_group60+ 15361.8 1493.2 10.288 5.61e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2983 on 34 degrees of freedom
## Multiple R-squared: 0.9951, Adjusted R-squared: 0.9944
## F-statistic: 1375 on 5 and 34 DF, p-value: < 2.2e-16
#Make predictions
pred <- model %>%
predict(test)
pred
## 1 5 8 10 16 18 20
## 16516.515 19380.157 21527.888 22959.709 52586.557 54018.377 55450.198
## 22 24 26 27 28 29 35
## 56882.019 58313.840 90923.214 91639.124 92355.034 93070.945 97366.407
## 38 44 49 55 57 59
## 96613.230 100908.692 1154.720 5450.183 6882.004 8313.824
#Verification with actual value and predicted values
RMSE <- RMSE(pred,test$Total)
RMSE
## [1] 2826.255
R2 <- R2(pred,test$Total)
R2
## [1] 0.9947472
hist(model$residuals)

qqnorm(model$residuals,ylab = "Residuals")
qqline(model$residuals)

# Prediction
new.speeds <- data.frame(
Year = c(2013, 2013, 2013) , Age_group = c("30-44","45-59","0-14"))
#(agedf)
predict(model, newdata = new.speeds)
## 1 2 3
## 98798.228 59029.750 9745.645
#future_years=data.frame(Year=c(2013:2033))
#future_years$total_case <- model %>%
# predict(future_years)
4.2)SVM Model for Age wrt Total
library(e1071)
set.seed(123)
train_samples <- agedf$Total %>% createDataPartition(p=0.80,list=FALSE)
train <- agedf[train_samples,]
test <- agedf[-train_samples,]
#agedf
# MLR Model creation
modelsvm1 <- svm(Total~Year+Age_group,data=train, kernel = 'linear')
summary(modelsvm1)
##
## Call:
## svm(formula = Total ~ Year + Age_group, data = train, kernel = "linear")
##
##
## Parameters:
## SVM-Type: eps-regression
## SVM-Kernel: linear
## cost: 1
## gamma: 0.1666667
## epsilon: 0.1
##
##
## Number of Support Vectors: 11
#Make predictions
pred <- modelsvm1 %>%predict(test)
pred
## 7 10 11 24 26 33 36
## 22320.441 24496.043 25221.244 58521.288 89093.369 94169.773 96345.375
## 40 47 53 54 58
## 96157.551 101233.955 4757.775 5482.976 8383.778
#Verification with actual value and predicted values
RMSE <- RMSE(pred,test$Total)
RMSE
## [1] 3231.497
R2 <- R2(pred,test$Total)
R2
## [1] 0.9948775
#find SST and SSE
sst <- sum((y - mean(y))^2)
sse <- sum((y_predicted - y)^2)
#find R-Squared
rsq <- 1-sse/sst
rsq
## [1] 0.9261391
4.3)Lasso for age grp
#install.packages("glmnet")
library(glmnet)
#Partitioning into train and test
set.seed(123)
train_samples <- agedf$Total %>%
createDataPartition(p=0.80,list=FALSE)
train <- agedf[train_samples,]
test <- agedf[-train_samples,]
#agedf
#install.packages("glmnet")
#library(glmnet)
#perform k-fold cross-validation to find optimal lambda value
cv_model <- cv.glmnet(data.matrix(train[, c('Year','Age_group')]), train$Total, alpha = 0.5)
cv_model
##
## Call: cv.glmnet(x = data.matrix(train[, c("Year", "Age_group")]), y = train$Total, alpha = 0.5)
##
## Measure: Mean-Squared Error
##
## Lambda Index Measure SE Nonzero
## min 6930 1 1.487e+09 161516174 0
## 1se 6930 1 1.487e+09 161516174 0
#find optimal lambda value that minimizes test MSE
best_lambda <- cv_model$lambda.min
best_lambda
## [1] 6929.659
#[1] best_lambda=2565.932
#produce plot of test MSE by lambda value
plot(cv_model)

#Best Lasso model
#t=data.matrix(train[, c('Year','State','Age_group')])
#t
best_model <- glmnet(data.matrix(train[, c('Year','Age_group')]), train$Total, alpha = 0.5, lambda = best_lambda)
coef(best_model)
## 3 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 5.521567e+04
## Year 4.214766e-13
## Age_group .
#Prediction
#define new observation
#new = matrix(c(2015,'Maharashtra',"45-59"), nrow=1, ncol=3)
#data.matrix(c(2015,'Maharashtra',"45-59"))
#new
#use lasso regression model to predict response value
#predict(best_model, s = best_lambda, newx = new)
x=data.matrix(test[, c('Year','Age_group')])
#x
y=test$Total
#Metrics
y_predicted <- predict(best_model, s = best_lambda, newx = x)
RMSE <- RMSE(y,y_predicted)
RMSE
## [1] 40523.15
R2 <- R2(y,y_predicted)
R2
## [,1]
## s1 0.0566469
#find SST and SSE
#sst <- sum((y - mean(y))^2)
#sse <- sum((y_predicted - y)^2)
#find R-Squared
#rsq <- 1-sse/sst
#rsq
4.4)Logistic Regression for age grp
#Partitioning into train and test
set.seed(123)
train_samples <- agedf$Year %>% createDataPartition(p=0.65,list=FALSE)
train <- agedf[train_samples,]
test <- agedf[-train_samples,]
train
## Year Age_group Total
## 2 2002 60+ 19502
## 3 2003 60+ 20131
## 4 2004 60+ 19608
## 6 2006 60+ 20288
## 7 2007 60+ 20443
## 9 2009 60+ 21485
## 11 2011 60+ 21457
## 12 2012 60+ 22150
## 13 2001 45-59 48788
## 14 2002 45-59 50101
## 15 2003 45-59 51731
## 17 2005 45-59 52429
## 19 2007 45-59 56164
## 21 2009 45-59 58020
## 23 2011 45-59 58032
## 25 2001 30-44 84609
## 30 2006 30-44 95655
## 31 2007 30-44 95370
## 32 2008 30-44 98751
## 33 2009 30-44 98341
## 34 2010 30-44 98670
## 36 2012 30-44 92987
## 37 2001 15-29 93274
## 39 2003 15-29 95906
## 40 2004 15-29 95084
## 41 2005 15-29 94026
## 42 2006 15-29 101304
## 43 2007 15-29 100250
## 45 2009 15-29 102474
## 46 2010 15-29 109118
## 47 2011 15-29 108921
## 48 2012 15-29 100139
## 50 2002 0-14 5189
## 51 2003 0-14 4923
## 52 2004 0-14 5217
## 53 2005 0-14 4850
## 54 2006 0-14 4773
## 56 2008 0-14 4322
## 58 2010 0-14 5571
## 60 2012 0-14 4461
test
## Year Age_group Total
## 1 2001 60+ 18613
## 5 2005 60+ 20274
## 8 2008 60+ 19781
## 10 2010 60+ 22748
## 16 2004 45-59 51213
## 18 2006 45-59 54432
## 20 2008 45-59 55497
## 22 2010 45-59 58778
## 24 2012 45-59 57267
## 26 2002 30-44 86718
## 27 2003 30-44 87929
## 28 2004 30-44 87807
## 29 2005 30-44 86708
## 35 2011 30-44 99956
## 38 2002 15-29 94394
## 44 2008 15-29 102831
## 49 2001 0-14 5632
## 55 2007 0-14 4732
## 57 2009 0-14 4848
## 59 2011 0-14 4840
as.factor(agedf$Total)
## [1] 18613 19502 20131 19608 20274 20288 20443 19781 21485 22748
## [11] 21457 22150 48788 50101 51731 51213 52429 54432 56164 55497
## [21] 58020 58778 58032 57267 84609 86718 87929 87807 86708 95655
## [31] 95370 98751 98341 98670 99956 92987 93274 94394 95906 95084
## [41] 94026 101304 100250 102831 102474 109118 108921 100139 5632 5189
## [51] 4923 5217 4850 4773 4732 4322 4848 5571 4840 4461
## 60 Levels: 4322 4461 4732 4773 4840 4848 4850 4923 5189 5217 5571 ... 109118
# Training model
logistic_modelAG <- glm(Total~Year+Age_group, data = train)
logistic_modelAG
##
## Call: glm(formula = Total ~ Year + Age_group, data = train)
##
## Coefficients:
## (Intercept) Year Age_group15-29 Age_group30-44 Age_group45-59
## -1431382.0 715.9 94742.6 89052.6 49284.1
## Age_group60+
## 15361.8
##
## Degrees of Freedom: 39 Total (i.e. Null); 34 Residual
## Null Deviance: 6.146e+10
## Residual Deviance: 302600000 AIC: 761.1
pred <- logistic_modelAG %>%
predict(test)
pred
## 1 5 8 10 16 18 20
## 16516.515 19380.157 21527.888 22959.709 52586.557 54018.377 55450.198
## 22 24 26 27 28 29 35
## 56882.019 58313.840 90923.214 91639.124 92355.034 93070.945 97366.407
## 38 44 49 55 57 59
## 96613.230 100908.692 1154.720 5450.183 6882.004 8313.824
RMSE <- RMSE(pred,test$Total)
RMSE
## [1] 2826.255
R2 <- R2(pred,test$Total)
R2
## [1] 0.9947472
test
## Year Age_group Total
## 1 2001 60+ 18613
## 5 2005 60+ 20274
## 8 2008 60+ 19781
## 10 2010 60+ 22748
## 16 2004 45-59 51213
## 18 2006 45-59 54432
## 20 2008 45-59 55497
## 22 2010 45-59 58778
## 24 2012 45-59 57267
## 26 2002 30-44 86718
## 27 2003 30-44 87929
## 28 2004 30-44 87807
## 29 2005 30-44 86708
## 35 2011 30-44 99956
## 38 2002 15-29 94394
## 44 2008 15-29 102831
## 49 2001 0-14 5632
## 55 2007 0-14 4732
## 57 2009 0-14 4848
## 59 2011 0-14 4840
5)Wrt State,Age grp,Year Culminated Model Filtration
####
# Extracting the needed year and suicide count columns
topstate1<-df2%>%filter(!State %in% c("Total (All India)","Total (States)","Total (Uts)"))%>%
select(State,Year,Age_group,Total) %>%
filter(!Age_group=="0-100")%>%
filter(!Age_group=="0-100+")%>%
group_by(Year,Age_group,State="Maharashtra")%>%
summarise(Total=sum(Total)) %>% arrange(desc(State))
## `summarise()` has grouped output by 'Year', 'Age_group'. You can override using the `.groups` argument.
topstate2<-df2%>%filter(!State %in% c("Total (All India)","Total (States)","Total (Uts)"))%>%
select(State,Year,Age_group,Total) %>%
filter(!Age_group=="0-100")%>%
filter(!Age_group=="0-100+")%>%
group_by(Year,Age_group,State="West Bengal")%>%
summarise(Total=sum(Total)) %>% arrange(desc(State))
## `summarise()` has grouped output by 'Year', 'Age_group'. You can override using the `.groups` argument.
topstate3<-df2%>%filter(!State %in% c("Total (All India)","Total (States)","Total (Uts)"))%>%
select(State,Year,Age_group,Total) %>%
filter(!Age_group=="0-100")%>%
filter(!Age_group=="0-100+")%>%
group_by(Year,Age_group,State="Andhra Pradesh")%>%
summarise(Total=sum(Total)) %>% arrange(desc(State))
## `summarise()` has grouped output by 'Year', 'Age_group'. You can override using the `.groups` argument.
#topstate3
model=rbind(topstate1,topstate2,topstate3)
modellasso=rbind(topstate1,topstate2,topstate3)
modelsvm=rbind(topstate1,topstate2,topstate3)
modelLog=rbind(topstate1,topstate2,topstate3)
5.1)Wrt State,Age grp,Year Culminated Model Lasso
#Testing co relation
#cor(suicide_count_overyears$Year,suicide_count_overyears$total_case)
#cor.test(suicide_count_overyears$Year,suicide_count_overyears$total_case)
#Partitioning into train and test
set.seed(123)
train_samples <- modellasso$Total %>%
createDataPartition(p=0.80,list=FALSE)
train <- modellasso[train_samples,]
test <- modellasso[-train_samples,]
#agedf
#install.packages("glmnet")
library(glmnet)
#perform k-fold cross-validation to find optimal lambda value
cv_model <- cv.glmnet(data.matrix(train[, c('Year','State','Age_group')]), train$Total, alpha = 0)
cv_model
##
## Call: cv.glmnet(x = data.matrix(train[, c("Year", "State", "Age_group")]), y = train$Total, alpha = 0)
##
## Measure: Mean-Squared Error
##
## Lambda Index Measure SE Nonzero
## min 154956 36 1.481e+09 78984660 3
## 1se 4021135 1 1.483e+09 71572128 3
#find optimal lambda value that minimizes test MSE
best_lambda <- cv_model$lambda.min
best_lambda
## [1] 154955.6
#[1] best_lambda=2565.932
#produce plot of test MSE by lambda value
plot(cv_model)

#Best Lasso model
#t=data.matrix(train[, c('Year','State','Age_group')])
#t
best_model <- glmnet(data.matrix(train[, c('Year','State','Age_group')]), train$Total, alpha = 0, lambda = best_lambda)
coef(best_model)
## 4 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) -403899.5431
## Year 228.4241
## State 667.3442
## Age_group -375.1431
#Prediction
#define new observation
#new = matrix(c(2015,'Maharashtra',"45-59"), nrow=1, ncol=3)
#data.matrix(c(2015,'Maharashtra',"45-59"))
#new
#use lasso regression model to predict response value
#predict(best_model, s = best_lambda, newx = new)
x=data.matrix(test[, c('Year','State','Age_group')])
#x
y=test$Total
#Metrics
y_predicted <- predict(best_model, s = best_lambda, newx = x)
y_predicted
## s1
## [1,] 54136.65
## [2,] 52636.08
## [3,] 54218.36
## [4,] 53468.07
## [5,] 54821.93
## [6,] 54381.77
## [7,] 54006.63
## [8,] 54838.62
## [9,] 55295.47
## [10,] 54920.32
## [11,] 54804.00
## [12,] 54428.85
## [13,] 54053.71
## [14,] 53760.27
## [15,] 55489.27
## [16,] 55114.13
## [17,] 55946.12
## [18,] 54820.69
## [19,] 55049.11
## [20,] 56027.82
## [21,] 54902.39
## [22,] 55881.10
## [23,] 55359.24
## [24,] 57316.66
## [25,] 53322.59
## [26,] 52947.45
## [27,] 53551.01
## [28,] 53404.30
## [29,] 53029.15
## [30,] 53257.58
## [31,] 54611.43
## [32,] 54236.29
## [33,] 54464.71
## [34,] 54317.99
## [35,] 55003.26
## [36,] 55981.97
RMSE <- RMSE(y,y_predicted)
RMSE
## [1] 37658.23
R2 <- R2(y,y_predicted)
R2
## [,1]
## s1 0.05828778
#find SST and SSE
#sst <- sum((y - mean(y))^2)
#sse <- sum((y_predicted - y)^2)
#sst
#sse
#find R-Squared
#rsq <- 1-sse/sst
#rsq
5.2)SVM Wrt State,Age grp,Year Culminated Model
library(e1071)
set.seed(123)
train_samples <- modelsvm$Total %>%
createDataPartition(p=0.80,list=FALSE)
train <- modelsvm[train_samples,]
test <- modelsvm[-train_samples,]
#agedf
# MLR Model creation
modelsvm <- svm(Total~Year+Age_group+State,data=train, kernel = 'linear')
summary(modelsvm)
##
## Call:
## svm(formula = Total ~ Year + Age_group + State, data = train, kernel = "linear")
##
##
## Parameters:
## SVM-Type: eps-regression
## SVM-Kernel: linear
## cost: 1
## gamma: 0.125
## epsilon: 0.1
##
##
## Number of Support Vectors: 35
#Make predictions
pred <- modelsvm %>%predict(test)
pred
## 1 2 3 4 5 6
## 552.9759 17064.7043 96668.2941 51764.1445 2986.9527 55009.4469
## 7 8 9 10 11 12
## 21932.6578 56632.0980 58254.7492 25177.9601 541.3441 95034.0111
## 13 14 15 16 17 18
## 87891.0185 18675.7237 2975.3208 97467.9878 4597.9720 54186.4895
## 19 20 21 22 23 24
## 54997.8150 100713.2901 22732.3516 94381.6232 24355.0027 9465.9255
## 25 26 27 28 29 30
## 95842.4252 88699.4326 96653.7507 90322.0838 52560.9268 53372.2524
## 31 32 33 34 35 36
## 4595.0605 99087.7275 99899.0531 93567.3861 96001.3628 9463.0140
#Verification with actual value and predicted values
RMSE <- RMSE(pred,test$Total)
RMSE
## [1] 2645.219
R2 <- R2(pred,test$Total)
R2
## [1] 0.9950006
5.3)LOGISTIC REGRESSION State,Age grp,Year Culminated Model
##BIG MODEL LOGISTIC REGRESSION
# Splitting dataset
set.seed(123)
train_samplesbm <- modelLog$Year %>% createDataPartition(p=0.65,list=FALSE)
#train_samples
head(train_samplesbm)
## Resample1
## [1,] 3
## [2,] 4
## [3,] 5
## [4,] 7
## [5,] 8
## [6,] 9
train <- modelLog[train_samplesbm,]
test <- modelLog[-train_samplesbm,]
#train
#test
as.factor(modelLog$Total)
## [1] 5632 93274 84609 48788 18613 5189 94394 86718 50101 19502
## [11] 4923 95906 87929 51731 20131 5217 95084 87807 51213 19608
## [21] 4850 94026 86708 52429 20274 4773 101304 95655 54432 20288
## [31] 4732 100250 95370 56164 20443 4322 102831 98751 55497 19781
## [41] 4848 102474 98341 58020 21485 5571 109118 98670 58778 22748
## [51] 4840 108921 99956 58032 21457 4461 100139 92987 57267 22150
## [61] 5632 93274 84609 48788 18613 5189 94394 86718 50101 19502
## [71] 4923 95906 87929 51731 20131 5217 95084 87807 51213 19608
## [81] 4850 94026 86708 52429 20274 4773 101304 95655 54432 20288
## [91] 4732 100250 95370 56164 20443 4322 102831 98751 55497 19781
## [101] 4848 102474 98341 58020 21485 5571 109118 98670 58778 22748
## [111] 4840 108921 99956 58032 21457 4461 100139 92987 57267 22150
## [121] 5632 93274 84609 48788 18613 5189 94394 86718 50101 19502
## [131] 4923 95906 87929 51731 20131 5217 95084 87807 51213 19608
## [141] 4850 94026 86708 52429 20274 4773 101304 95655 54432 20288
## [151] 4732 100250 95370 56164 20443 4322 102831 98751 55497 19781
## [161] 4848 102474 98341 58020 21485 5571 109118 98670 58778 22748
## [171] 4840 108921 99956 58032 21457 4461 100139 92987 57267 22150
## 60 Levels: 4322 4461 4732 4773 4840 4848 4850 4923 5189 5217 5571 ... 109118
# Training model
logistic_modelbm <- glm(Total~Year+Age_group+State, data = train)
logistic_modelbm
##
## Call: glm(formula = Total ~ Year + Age_group + State, data = train)
##
## Coefficients:
## (Intercept) Year Age_group15-29 Age_group30-44
## -1520152.5 760.0 95102.8 88184.2
## Age_group45-59 Age_group60+ StateMaharashtra StateWest Bengal
## 49494.8 15737.9 -119.4 -124.0
##
## Degrees of Freedom: 119 Total (i.e. Null); 112 Residual
## Null Deviance: 1.713e+11
## Residual Deviance: 849300000 AIC: 2251
pred <- logistic_modelbm %>%
predict(test)
pred
## 1 2 3 4 5 6
## 531.0476 95633.8662 1291.0691 97913.9306 90995.3119 52305.8882
## 7 8 9 10 11 12
## 53825.9311 5091.1763 100193.9949 5851.1978 55345.9740 6611.2192
## 13 14 15 16 17 18
## 7371.2407 102474.0593 95555.4406 96315.4621 57626.0384 97075.4835
## 19 20 21 22 23 24
## 526.4169 88710.6168 16264.3217 1286.4383 96389.2569 89470.6383
## 25 26 27 28 29 30
## 50781.2146 17784.3646 97909.2998 18544.3860 98669.3213 53061.2789
## 31 32 33 34 35 36
## 92510.7241 20064.4289 54581.3218 20824.4504 5846.5670 55341.3433
## 37 38 39 40 41 42
## 56101.3647 95550.8099 56861.3862 23864.5362 95753.2167 1410.4195
## 43 44 45 46 47 48
## 50905.1958 17908.3458 2930.4624 91114.6624 18668.3672 53185.2601
## 49 50 51 52 53 54
## 99553.3239 5970.5482 21708.4530 6730.5697 101833.3883 56225.3459
## 55 56 57 58 59 60
## 7490.5911 102593.4097 96434.8125 23988.5174 9010.6340 97194.8340
RMSE <- RMSE(pred,test$Total)
RMSE
## [1] 2808.221
R2 <- R2(pred,test$Total)
R2
## [1] 0.9946365
#test
5.4)Wrt State,Age grp,Year Culminated Model MLR
#Testing co relation
#cor(suicide_count_overyears$Year,suicide_count_overyears$total_case)
#cor.test(suicide_count_overyears$Year,suicide_count_overyears$total_case)
#Partitioning into train and test
set.seed(123)
train_samples <- model$Total %>%
createDataPartition(p=0.80,list=FALSE)
train <- model[train_samples,]
test <- model[-train_samples,]
#agedf
# MLR Model creation
model <- lm(Total~Year+Age_group+State,data=train)
summary(model)
##
## Call:
## lm(formula = Total ~ Year + Age_group + State, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5030.1 -2021.1 -102.4 2051.1 6593.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.506e+06 1.366e+05 -11.024 <2e-16 ***
## Year 7.530e+02 6.808e+01 11.061 <2e-16 ***
## Age_group15-29 9.527e+04 7.521e+02 126.677 <2e-16 ***
## Age_group30-44 8.796e+04 7.395e+02 118.955 <2e-16 ***
## Age_group45-59 4.948e+04 7.520e+02 65.801 <2e-16 ***
## Age_group60+ 1.579e+04 7.389e+02 21.369 <2e-16 ***
## StateMaharashtra -2.883e+02 5.713e+02 -0.505 0.615
## StateWest Bengal -1.262e+02 5.813e+02 -0.217 0.828
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2809 on 136 degrees of freedom
## Multiple R-squared: 0.9949, Adjusted R-squared: 0.9946
## F-statistic: 3760 on 7 and 136 DF, p-value: < 2.2e-16
#Make predictions
pred <- model %>%
predict(test)
pred
## 1 2 3 4 5 6
## 472.9349 16261.6808 97253.4485 51459.6336 2732.0624 54471.8036
## 7 8 9 10 11 12
## 20779.9358 55977.8886 57483.9736 23792.1058 634.9710 95909.3996
## 13 14 15 16 17 18
## 88599.7117 17929.8019 2894.0985 98168.5271 4400.1835 53880.7972
## 19 20 21 22 23 24
## 54633.8397 101180.6971 21695.0144 94624.0517 23201.0994 8918.4385
## 25 26 27 28 29 30
## 96788.6794 89478.9915 97541.7219 90985.0765 52500.9495 53253.9920
## 31 32 33 34 35 36
## 4526.4208 99800.8494 100553.8919 93997.2465 96256.3740 9044.6758
#Verification with actual value and predicted values
RMSE <- RMSE(pred,test$Total)
RMSE
## [1] 2645.477
R2 <- R2(pred,test$Total)
R2
## [1] 0.9951535
hist(model$residuals)

qqnorm(model$residuals,ylab = "Residuals")
qqline(model$residuals)

# Prediction
new.speeds <- data.frame(
Year = c(2013, 2013, 2013,2022,2023,2024) , Age_group = c("30-44","45-59","0-14","15-29","60+","30-44") ,State=c("Maharashtra","West Bengal","Andhra Pradesh","Andhra Pradesh","Maharashtra","West Bengal")
)
#(agedf)
predict(model, newdata = new.speeds)
## 1 2 3 4 5 6
## 97474.186 59152.095 9797.718 111849.529 32828.616 105919.689